{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>city</th>\n",
       "      <th>boolean</th>\n",
       "      <th>ordinal_column</th>\n",
       "      <th>quantitative_column</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>tokyo</td>\n",
       "      <td>yes</td>\n",
       "      <td>somewhat like</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>None</td>\n",
       "      <td>no</td>\n",
       "      <td>like</td>\n",
       "      <td>11.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>london</td>\n",
       "      <td>None</td>\n",
       "      <td>somewhat like</td>\n",
       "      <td>-0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>seattle</td>\n",
       "      <td>no</td>\n",
       "      <td>like</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>san francisco</td>\n",
       "      <td>no</td>\n",
       "      <td>somewhat like</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>tokyo</td>\n",
       "      <td>yes</td>\n",
       "      <td>dislike</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            city boolean ordinal_column  quantitative_column\n",
       "0          tokyo     yes  somewhat like                  1.0\n",
       "1           None      no           like                 11.0\n",
       "2         london    None  somewhat like                 -0.5\n",
       "3        seattle      no           like                 10.0\n",
       "4  san francisco      no  somewhat like                  NaN\n",
       "5          tokyo     yes        dislike                 20.0"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "X = pd.DataFrame({'city':['tokyo', None, 'london', 'seattle', 'san francisco', 'tokyo'], \n",
    "                  'boolean':['yes', 'no', None, 'no', 'no', 'yes'], \n",
    "                  'ordinal_column':['somewhat like', 'like', 'somewhat like', 'like', 'somewhat like', 'dislike'], \n",
    "                  'quantitative_column':[1, 11, -.5, 10, None, 20]})\n",
    "X"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- boolean: 是二元分类数据(是/否)，定类等级\n",
    "- city: 此列是分类数据，也是定类等级\n",
    "- ordinal_column: 顺序数据，定序等级\n",
    "- quantitative_column: 整数，定比等级"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "city                   1\n",
       "boolean                1\n",
       "ordinal_column         0\n",
       "quantitative_column    1\n",
       "dtype: int64"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'tokyo'"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X['city'].value_counts().index[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0            tokyo\n",
       "1            tokyo\n",
       "2           london\n",
       "3          seattle\n",
       "4    san francisco\n",
       "5            tokyo\n",
       "Name: city, dtype: object"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X['city'].fillna(X['city'].value_counts().index[0])  # fill empty slots with most common category"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.base import TransformerMixin\n",
    "\n",
    "class CustomCategoryImputer(TransformerMixin):\n",
    "    def __init__(self, cols=None):\n",
    "        self.cols=cols\n",
    "    \n",
    "    def transform(self, df):\n",
    "        X = df.copy()\n",
    "        for col in self.cols:\n",
    "            X[col].fillna(X[col].value_counts().index[0], inplace=True)\n",
    "        return X\n",
    "    \n",
    "    def fit(self, *_):\n",
    "        return self"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>city</th>\n",
       "      <th>boolean</th>\n",
       "      <th>ordinal_column</th>\n",
       "      <th>quantitative_column</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>tokyo</td>\n",
       "      <td>yes</td>\n",
       "      <td>somewhat like</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>tokyo</td>\n",
       "      <td>no</td>\n",
       "      <td>like</td>\n",
       "      <td>11.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>london</td>\n",
       "      <td>no</td>\n",
       "      <td>somewhat like</td>\n",
       "      <td>-0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>seattle</td>\n",
       "      <td>no</td>\n",
       "      <td>like</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>san francisco</td>\n",
       "      <td>no</td>\n",
       "      <td>somewhat like</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>tokyo</td>\n",
       "      <td>yes</td>\n",
       "      <td>dislike</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            city boolean ordinal_column  quantitative_column\n",
       "0          tokyo     yes  somewhat like                  1.0\n",
       "1          tokyo      no           like                 11.0\n",
       "2         london      no  somewhat like                 -0.5\n",
       "3        seattle      no           like                 10.0\n",
       "4  san francisco      no  somewhat like                  NaN\n",
       "5          tokyo     yes        dislike                 20.0"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cci = CustomCategoryImputer(cols=['city', 'boolean'])\n",
    "cci.fit_transform(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import Imputer\n",
    "\n",
    "class CustomQuantitativeImputer(TransformerMixin):\n",
    "    def __init__(self, cols=None, strategy='mean'):\n",
    "        self.cols=cols\n",
    "        self.strategy = strategy\n",
    "        \n",
    "    def transform(self, df):\n",
    "        df.copy()\n",
    "        impute = Imputer(strategy=self.strategy)\n",
    "        for col in self.cols:\n",
    "            X[col] = impute.fit_transform(X[col].values.reshape(-1, 1))\n",
    "        return X\n",
    "    \n",
    "    def fit(self, *_):\n",
    "        return self"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "cqi = CustomQuantitativeImputer(cols=['quantitative_column'], strategy='mean')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/super/anaconda3/lib/python3.7/site-packages/sklearn/utils/deprecation.py:66: DeprecationWarning: Class Imputer is deprecated; Imputer was deprecated in version 0.20 and will be removed in 0.22. Import impute.SimpleImputer from sklearn instead.\n",
      "  warnings.warn(msg, category=DeprecationWarning)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>city</th>\n",
       "      <th>boolean</th>\n",
       "      <th>ordinal_column</th>\n",
       "      <th>quantitative_column</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>tokyo</td>\n",
       "      <td>yes</td>\n",
       "      <td>somewhat like</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>tokyo</td>\n",
       "      <td>no</td>\n",
       "      <td>like</td>\n",
       "      <td>11.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>london</td>\n",
       "      <td>no</td>\n",
       "      <td>somewhat like</td>\n",
       "      <td>-0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>seattle</td>\n",
       "      <td>no</td>\n",
       "      <td>like</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>san francisco</td>\n",
       "      <td>no</td>\n",
       "      <td>somewhat like</td>\n",
       "      <td>8.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>tokyo</td>\n",
       "      <td>yes</td>\n",
       "      <td>dislike</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            city boolean ordinal_column  quantitative_column\n",
       "0          tokyo     yes  somewhat like                  1.0\n",
       "1          tokyo      no           like                 11.0\n",
       "2         london      no  somewhat like                 -0.5\n",
       "3        seattle      no           like                 10.0\n",
       "4  san francisco      no  somewhat like                  8.3\n",
       "5          tokyo     yes        dislike                 20.0"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.pipeline import Pipeline\n",
    "\n",
    "imputer = Pipeline([('quant', cqi), ('category', cci)])\n",
    "imputer.fit_transform(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "class CustomDummifier(TransformerMixin):\n",
    "    def __init__(self, cols=None):\n",
    "        self.cols = cols\n",
    "        \n",
    "    def transform(self, X):\n",
    "        return pd.get_dummies(X, columns=self.cols)\n",
    "    \n",
    "    def fit(self, *_):\n",
    "        return self"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ordinal_column</th>\n",
       "      <th>quantitative_column</th>\n",
       "      <th>boolean_no</th>\n",
       "      <th>boolean_yes</th>\n",
       "      <th>city_london</th>\n",
       "      <th>city_san francisco</th>\n",
       "      <th>city_seattle</th>\n",
       "      <th>city_tokyo</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>somewhat like</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>like</td>\n",
       "      <td>11.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>somewhat like</td>\n",
       "      <td>-0.5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>like</td>\n",
       "      <td>10.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>somewhat like</td>\n",
       "      <td>8.3</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>dislike</td>\n",
       "      <td>20.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  ordinal_column  quantitative_column  boolean_no  boolean_yes  city_london  \\\n",
       "0  somewhat like                  1.0           0            1            0   \n",
       "1           like                 11.0           1            0            0   \n",
       "2  somewhat like                 -0.5           0            0            1   \n",
       "3           like                 10.0           1            0            0   \n",
       "4  somewhat like                  8.3           1            0            0   \n",
       "5        dislike                 20.0           0            1            0   \n",
       "\n",
       "   city_san francisco  city_seattle  city_tokyo  \n",
       "0                   0             0           1  \n",
       "1                   0             0           0  \n",
       "2                   0             0           0  \n",
       "3                   0             1           0  \n",
       "4                   1             0           0  \n",
       "5                   0             0           1  "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cd = CustomDummifier(cols=['boolean', 'city'])\n",
    "\n",
    "cd.fit_transform(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    somewhat like\n",
       "1             like\n",
       "2    somewhat like\n",
       "3             like\n",
       "4    somewhat like\n",
       "5          dislike\n",
       "Name: ordinal_column, dtype: object"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ordering = ['dislike', 'somewhat like', 'like']  # 0 for dislike, 1 for somewhat like, and 2 for like\n",
    "\n",
    "X['ordinal_column']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "class CustomEncoder(TransformerMixin):\n",
    "    def __init__(self, col, ordering=None):\n",
    "        self.ordering = ordering\n",
    "        self.col = col\n",
    "        \n",
    "    def transform(self, df):\n",
    "        X = df.copy()\n",
    "        X[self.col] = X[self.col].map(lambda x: self.ordering.index(x))\n",
    "        return X\n",
    "    \n",
    "    def fit(self, *_):\n",
    "        return self"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>city</th>\n",
       "      <th>boolean</th>\n",
       "      <th>ordinal_column</th>\n",
       "      <th>quantitative_column</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>tokyo</td>\n",
       "      <td>yes</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>None</td>\n",
       "      <td>no</td>\n",
       "      <td>2</td>\n",
       "      <td>11.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>london</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "      <td>-0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>seattle</td>\n",
       "      <td>no</td>\n",
       "      <td>2</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>san francisco</td>\n",
       "      <td>no</td>\n",
       "      <td>1</td>\n",
       "      <td>8.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>tokyo</td>\n",
       "      <td>yes</td>\n",
       "      <td>0</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            city boolean  ordinal_column  quantitative_column\n",
       "0          tokyo     yes               1                  1.0\n",
       "1           None      no               2                 11.0\n",
       "2         london    None               1                 -0.5\n",
       "3        seattle      no               2                 10.0\n",
       "4  san francisco      no               1                  8.3\n",
       "5          tokyo     yes               0                 20.0"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ce = CustomEncoder(col='ordinal_column', ordering = ['dislike', 'somewhat like', 'like'])\n",
    "\n",
    "ce.fit_transform(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "class CustomCutter(TransformerMixin):\n",
    "    def __init__(self, col, bins, labels=False):\n",
    "        self.labels = labels\n",
    "        self.bins = bins\n",
    "        self.col = col\n",
    "        \n",
    "    def transform(self, df):\n",
    "        X = df.copy()\n",
    "        X[self.col] = pd.cut(X[self.col], bins=self.bins, labels=self.labels)\n",
    "        return X\n",
    "    \n",
    "    def fit(self, *_):\n",
    "        return self"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>city</th>\n",
       "      <th>boolean</th>\n",
       "      <th>ordinal_column</th>\n",
       "      <th>quantitative_column</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>tokyo</td>\n",
       "      <td>yes</td>\n",
       "      <td>somewhat like</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>None</td>\n",
       "      <td>no</td>\n",
       "      <td>like</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>london</td>\n",
       "      <td>None</td>\n",
       "      <td>somewhat like</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>seattle</td>\n",
       "      <td>no</td>\n",
       "      <td>like</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>san francisco</td>\n",
       "      <td>no</td>\n",
       "      <td>somewhat like</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>tokyo</td>\n",
       "      <td>yes</td>\n",
       "      <td>dislike</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            city boolean ordinal_column  quantitative_column\n",
       "0          tokyo     yes  somewhat like                    0\n",
       "1           None      no           like                    1\n",
       "2         london    None  somewhat like                    0\n",
       "3        seattle      no           like                    1\n",
       "4  san francisco      no  somewhat like                    1\n",
       "5          tokyo     yes        dislike                    2"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cc = CustomCutter(col='quantitative_column', bins=3)\n",
    "\n",
    "cc.fit_transform(X)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 4. 拓展数值特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('~/开发/AI相关/特征工程/Feature-Engineering-Made-Easy-master/data/activity_recognizer/1.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1502</th>\n",
       "      <th>2215</th>\n",
       "      <th>2153</th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.0</td>\n",
       "      <td>1667</td>\n",
       "      <td>2072</td>\n",
       "      <td>2047</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2.0</td>\n",
       "      <td>1611</td>\n",
       "      <td>1957</td>\n",
       "      <td>1906</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3.0</td>\n",
       "      <td>1601</td>\n",
       "      <td>1939</td>\n",
       "      <td>1831</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4.0</td>\n",
       "      <td>1643</td>\n",
       "      <td>1965</td>\n",
       "      <td>1879</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5.0</td>\n",
       "      <td>1604</td>\n",
       "      <td>1959</td>\n",
       "      <td>1921</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     0  1502  2215  2153  1\n",
       "0  1.0  1667  2072  2047  1\n",
       "1  2.0  1611  1957  1906  1\n",
       "2  3.0  1601  1939  1831  1\n",
       "3  4.0  1643  1965  1879  1\n",
       "4  5.0  1604  1959  1921  1"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>index</th>\n",
       "      <th>x</th>\n",
       "      <th>y</th>\n",
       "      <th>z</th>\n",
       "      <th>activity</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.0</td>\n",
       "      <td>1667</td>\n",
       "      <td>2072</td>\n",
       "      <td>2047</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2.0</td>\n",
       "      <td>1611</td>\n",
       "      <td>1957</td>\n",
       "      <td>1906</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3.0</td>\n",
       "      <td>1601</td>\n",
       "      <td>1939</td>\n",
       "      <td>1831</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4.0</td>\n",
       "      <td>1643</td>\n",
       "      <td>1965</td>\n",
       "      <td>1879</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5.0</td>\n",
       "      <td>1604</td>\n",
       "      <td>1959</td>\n",
       "      <td>1921</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   index     x     y     z  activity\n",
       "0    1.0  1667  2072  2047         1\n",
       "1    2.0  1611  1957  1906         1\n",
       "2    3.0  1601  1939  1831         1\n",
       "3    4.0  1643  1965  1879         1\n",
       "4    5.0  1604  1959  1921         1"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.columns = ['index', 'x', 'y', 'z', 'activity']\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1910.6733723076923"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['x'].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/super/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_split.py:1978: FutureWarning: The default value of cv will change from 3 to 5 in version 0.22. Specify it explicitly to silence this warning.\n",
      "  warnings.warn(CV_WARNING, FutureWarning)\n",
      "/Users/super/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_split.py:657: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of members in any class cannot be less than n_splits=3.\n",
      "  % (min_groups, self.n_splits)), Warning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.7205230769230769 {'n_neighbors': 5}\n"
     ]
    }
   ],
   "source": [
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "\n",
    "X = df[['x', 'y', 'z']]\n",
    "y = df['activity']\n",
    "\n",
    "knn_params = {'n_neighbors':[3,4,5,6]}\n",
    "knn = KNeighborsClassifier()\n",
    "grid = GridSearchCV(knn, knn_params)\n",
    "grid.fit(X, y)\n",
    "\n",
    "print(grid.best_score_, grid.best_params_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(162500, 9)"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.preprocessing import PolynomialFeatures\n",
    "\n",
    "poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only = False)\n",
    "X_poly = poly.fit_transform(X)\n",
    "X_poly.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>x0</th>\n",
       "      <th>x1</th>\n",
       "      <th>x2</th>\n",
       "      <th>x0^2</th>\n",
       "      <th>x0 x1</th>\n",
       "      <th>x0 x2</th>\n",
       "      <th>x1^2</th>\n",
       "      <th>x1 x2</th>\n",
       "      <th>x2^2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1667.0</td>\n",
       "      <td>2072.0</td>\n",
       "      <td>2047.0</td>\n",
       "      <td>2778889.0</td>\n",
       "      <td>3454024.0</td>\n",
       "      <td>3412349.0</td>\n",
       "      <td>4293184.0</td>\n",
       "      <td>4241384.0</td>\n",
       "      <td>4190209.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1611.0</td>\n",
       "      <td>1957.0</td>\n",
       "      <td>1906.0</td>\n",
       "      <td>2595321.0</td>\n",
       "      <td>3152727.0</td>\n",
       "      <td>3070566.0</td>\n",
       "      <td>3829849.0</td>\n",
       "      <td>3730042.0</td>\n",
       "      <td>3632836.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1601.0</td>\n",
       "      <td>1939.0</td>\n",
       "      <td>1831.0</td>\n",
       "      <td>2563201.0</td>\n",
       "      <td>3104339.0</td>\n",
       "      <td>2931431.0</td>\n",
       "      <td>3759721.0</td>\n",
       "      <td>3550309.0</td>\n",
       "      <td>3352561.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1643.0</td>\n",
       "      <td>1965.0</td>\n",
       "      <td>1879.0</td>\n",
       "      <td>2699449.0</td>\n",
       "      <td>3228495.0</td>\n",
       "      <td>3087197.0</td>\n",
       "      <td>3861225.0</td>\n",
       "      <td>3692235.0</td>\n",
       "      <td>3530641.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1604.0</td>\n",
       "      <td>1959.0</td>\n",
       "      <td>1921.0</td>\n",
       "      <td>2572816.0</td>\n",
       "      <td>3142236.0</td>\n",
       "      <td>3081284.0</td>\n",
       "      <td>3837681.0</td>\n",
       "      <td>3763239.0</td>\n",
       "      <td>3690241.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       x0      x1      x2       x0^2      x0 x1      x0 x2       x1^2  \\\n",
       "0  1667.0  2072.0  2047.0  2778889.0  3454024.0  3412349.0  4293184.0   \n",
       "1  1611.0  1957.0  1906.0  2595321.0  3152727.0  3070566.0  3829849.0   \n",
       "2  1601.0  1939.0  1831.0  2563201.0  3104339.0  2931431.0  3759721.0   \n",
       "3  1643.0  1965.0  1879.0  2699449.0  3228495.0  3087197.0  3861225.0   \n",
       "4  1604.0  1959.0  1921.0  2572816.0  3142236.0  3081284.0  3837681.0   \n",
       "\n",
       "       x1 x2       x2^2  \n",
       "0  4241384.0  4190209.0  \n",
       "1  3730042.0  3632836.0  \n",
       "2  3550309.0  3352561.0  \n",
       "3  3692235.0  3530641.0  \n",
       "4  3763239.0  3690241.0  "
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.DataFrame(X_poly, columns=poly.get_feature_names()).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x1a1efa2f98>"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAW8AAAD8CAYAAAC4uSVNAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAd0ElEQVR4nO3df5BdZZ3n8fcnIdlAgiEBZwoIKjsElQkYd5KgjMgPMQSmAGtqXINAAoKZsmTjiLUzrvJDYywFd3ZHSpixYTLSEHCUdbYaCMaAKKkNcbpVJpNEITHOSm/QjBAoBSJ093f/OE/ketJ977nd956b0/15Uae459fzPffm1vc+/ZznPI8iAjMzq5ZJnb4AMzNrnpO3mVkFOXmbmVWQk7eZWQU5eZuZVZCTt5lZBTl5m5m1kaQ1kvZI2jrCfkm6WdJOSVsk/aci5Tp5m5m111eAJXX2nwfMTcsK4G+LFOrkbWbWRhHxKPBsnUMuArojsxk4QtLRjco9pFUX2G6v/HJX6Y+CHnbM6WWHBODps07oSNyZa/+h9JivrL2p9JgAOvGkjsQ9/KIbOxL3oVmnlR5z3vw9pcfc76j139VYzm8m30x97R/8OVmNeb+uiOhqItyxwFM16/1p29P1TqpM8jYzOxilRN1Mss4b7oem4Y+Hk7eZWd7QYJnR+oHjatbnALsbneQ2bzOzvMGB4svY9QDLUq+TtwHPR0TdJhNwzdvM7AARQy0rS9I9wJnAUZL6gRuAKVmc+DtgHXA+sBN4EbiiSLlO3mZmeUOtS94RcXGD/QF8uNlynbzNzPJaWPNuFydvM7O8cm9YjoqTt5lZXgVq3h3pbSJpuaQdaVneiWswMxtJDA4UXjql9Jq3pNlkd1sXkHVE/76knojYW/a1mJkNq4U3LNulrTVvSQvTKFnTJE2XtI3sruqGiHg2JewN1B+0xcysXDFUfOmQtibviOgl64C+GrgJuAt4ieGf4z+ApBWS+iT13d59Tzsv1czsVUODxZcOKaPZZBXQC+wDVgLXDHPMsM/x144Z0ImBqcxsgqrADcsykvdsYAbZE0XTyGraZ9bsnwN8p4TrMDMrpoM3Iosqo7dJF3AdsBa4EVgPLJY0S9IsYHHaZmZ2cBgaKr50SFtr3pKWAQMRcbekycAmYD7wGbKmFIBVEVFvoHIzs1JFTPCHdCKiG+hOrweBU2t2r2lnbDOzUXObt5lZBVWgn7eTt5lZnmveZmYVNPhKp6+gISdvM7M8N5uYmVWQm03MzCrINe/WOeyY00uP+eLujaXHBHj7yZ0ZJfdfOvAZ7z7jhNJjAszv68y/7Usd+k794k+uKj3mki2dS4B9Yy3AydvMrHrCNyzNzCrIbd5mZhXkZhMzswpyzdvMrIJc8zYzqyDXvM3MKmjg4J+MwcnbzCyvAjXvMmbSOYCkb0p6TtL9nYhvZlZXBWbS6UjyBr4AXNah2GZm9cVQ8aUASUskPSFpp6SPD7P/dZIekfRDSVsknd+ozLYmb0kL04VMkzRd0jZJ8yLiYeBX7YxtZjZqLax5pykgbwHOA04CLpZ0Uu6wa4GvRcRbgaXArY3Kbfc0aL2SeoDVwKHAXRGxtZ0xzczGrLVt3ouAnRGxC0DSV4GLgO21EYHXpNczgd2NCi3jhuUqssmG9wErmzlR0gpgBcCkyTOZNGl666/OzCyvid4mtXkq6YqIrpr1Y4Gnatb7+d35fAE+BXxL0n8BpgPnNIpbRvKeDcwApgDTgBeKnpg+gC6AKVOPjbZcnZlZXhRPN7V5agQa7rTc+sXAVyLiryW9HbgzNTGP+CdAGTcsu4DrgLXAjSXEMzMbm9b2NukHjqtZn8OBzSJXAl8DiIjHyCq6R9UrtN03LJcBAxFxN/B5YKGksyVtBL4OvEtSv6Rz23kdZmZNaW3y7gXmSjpe0lSyG5I9uWN+BrwLQNKbyZL3v9crtN03LLuB7vR6kFfbeb7dzrhmZmPSwhuWETEg6WpgPTAZWBMR2yStAvoiogf4GHCbpI+SNalcHlG/7cZPWJqZ5Q0OtrS4iFgHrMttu77m9Xbgj5sp08nbzCzPowqamVWQk7eZWQVVYGAqJ28zs5wYOvgfK3HyNjPLc7NJ6zx91gmlx3z7yctLjwnw2L/e0ZG4z192Rekx9zw5o/SYAD/5zl91JG6nvlMb77yk9JiPzV9cesyWaXFvk3aoTPI2MyuNa95mZhXk5G1mVkFNDEzVKU7eZmZ5rnmbmVWQuwqamVWQe5uYmVVPuNnEzKyC3GxiZlZBFRjbpIxp0H6HpPmSHpO0TdIWSe8r+xrMzOoaiuJLh3Si5v0isCwidkg6Bvi+pPUR8VwHrsXM7EADB/8Ny3bPYbkw1a6nSZouaRswNSJ2AETEbmAP8Np2XoeZWVNiqPjSIW1N3hHRSzbR5mrgJuCuiNi6f7+kRcBU4CfDnS9phaQ+SX3d/U+381LNzF7lZhMAVpHNnrwPWLl/o6SjgTuB5RHD/3xFRBfQBfDLc884+G//mtm44K6CmdnADGAK2XT2L0h6DfAAcG1EbC7hGszMiqtAV8Eyept0AdcBa4EbJU0F/gnojoivlxDfzKw5E73ZRNIyYCAi7pY0GdgELAXeCRwp6fJ06OUR8Xg7r8XMrLCJ/nh8RHQD3en1IHBq2tXdzrhmZmPhOSzNzKrIydvMrILc28TMrIIqUPMufWwTM7ODXot7m0haIukJSTslfXyEY/6zpO1p3Ke7G5XpmreZWU4Mtq7ZJPW0uwV4N9AP9ErqiYjtNcfMBf4b8McRsVfS7zUqtzLJe+bafyg95r8cc3rpMQGev+yKjsSdeWf5n/Fha28qPSZA/HxXR+I+/kxn4vZd+lDpMU/6o3tKj7nfkQ98d2wFtLbZZBGwMyJ2AUj6KnARsL3mmA8Ct0TEXoCI2NOoUDebmJnlxFAUXgo4FniqZr0/bat1InCipP8jabOkJY0KrUzN28ysNE3UvCWtAFbUbOpK4zL99pBhTssHOASYC5wJzAE2SppXb6hsJ28zs7wmmrxrB9AbQT9wXM36HGD3MMdsjohXgJ9KeoIsmfeOVKibTczMcmJgqPBSQC8wV9LxaWynpWRDZdf638BZAJKOImtGqXuDxDVvM7O8Fj6jExEDkq4G1gOTgTURsU3SKqAvInrSvsWStgODwH+NiGfqlevkbWaW0+qxTSJiHbAut+36mtcBXJOWQpy8zczyDv6n4528zczyPKqgmVkVVaDmPereJpKWS9qRluXD7P+ypBcknZ3bfk16fn+LpIclvX6012Bm1g4xUHzplFElb0mzgRvIJldYBNwgaVbN/muBWWn/LZJOqTn9h8CCiDgFuJdsVnkzs4NGDBVfOqVh8pa0MNWSp0maLmkb8GFgQ0Q8m57F3wAsSccvB+YB74+IrcCFwG2SjgOIiEci4sVU/GayDutmZgePoSaWDmmYvCOil6xD+WqyWvJdwEuM8Kx+RNwREUsjsj8oImJHRJwaEU9xoCuBB0eKLWmFpD5Jfbd3d26QGzObWKpQ8y56w3IV2VNC+4CVDN8Xsanbs5IuBRYAZ4x0TO1jp6/8ctfBf/vXzMaFTiblooq2ec8GZgCHA9Mo9qz+iCSdA3wSuDAiflP0PDOzMsSgCi+dUjR5dwHXAWuBG3n1Uc5Z6Ubl4rStIUlvBb5MlrgbjllrZla2cdFsImkZMBARd6cZITYB84HP8OqIV6si4tmCMb9AVov/uiSAn0XEhU1fuZlZm8RQ52rURTVM3hHRDXSn14Nk3f/2W9NswIg4p9lzzMzKVIU2bz9haWaWEzEOat5mZhONa95mZhU01MFeJEU5eZuZ5YyLG5ZmZhONk7eZWQVFBZ7nrkzyfmVt+YMP7j7jhNJjAux5ckZH4h7Wgc94yiV/WXpMgH2rP9KRuLtP78x3avrbyq9J/uxr00uPud+RYzzfNW8zswpyV0EzswoadG8TM7Pqcc3bzKyC3OZtZlZB7m1iZlZBrnmbmVXQ4NCo5mYvlZO3mVlOFZpNRv3zImm5pB1pWd7kuZ+V9JSkX482vplZuwyFCi+dMqrkLWk2cAPZxAyLgBvSdGhF3ZfOMzM76ESo8FKEpCWSnpC0U9LH6xz3Z5JC0oJGZdZN3pIWStoiaZqk6ZK2SZoHnAtsiIhnI2IvsAFYkjt3ZrrYN6b1eyR9ECAiNkfE043fsplZ+SKKL42k6SNvAc4DTgIulnTSMMcdDqwEvlfkGusm74joBXqA1cBNwF0RsRU4Fniq5tD+tK323OeBq4GvSFoKzIqI24pc1H6SVkjqk9S3ZtP2Zk41Mxu1FjebLAJ2RsSuiHgZ+Cpw0TDHfYYsz+4rUmiRZpNVwLuBBalggOGu+IDfoIjYAPwr2a/OVUUuKHd+V0QsiIgFHzjtgB8qM7O2GByaVHiprWSmZUWuuIaVXUlvBY6LiPuLXmOR3iazyWZ7nwJMA15Iwc+sOWYO8J38iZImAW8GXkrl9Be9MDOzTmmms0lEdAFddQ6pW9lNefJ/Apc3EbZQzbsLuA5YC9yYtq0HFkualW5ULk7b8j4K/Ai4GFgjaUozF2dm1gktbjbpB46rWZ8D7K5ZPxyYB3xH0r8BbwN6Gt20bHTDchkwEBF3A58HFko6OyKeJWuf6U3LqrSt9twTyZpKPhYRG4FHgWvTvpsk9QOHSeqX9Kl612FmVqYW9zbpBeZKOl7SVGAp2b3EFCuej4ijIuINEfEGYDNwYUT01Su0brNJRHQD3en1IFnXwP371gBr6pz7JFmTyf71a2pe/yXQmVH4zcwaaOXk8RExIOlqstaJycCaiNgmaRXQFxE99UsYnp+wNDPLiWGbqcdQXsQ6YF1u2/UjHHtmkTKdvM3McgY8nreZWfW0uubdDk7eZmY5rWzzbhcnbzOzHNe8zcwqyDXvFtKJ5T8eP79vY+kxAX7ynb/qSNz4+a7SY+5b/ZHSYwJMu/aLHYn7prkXdCTujvf8Yekxj7/3itJjtsqga95mZtVTgVnQnLzNzPKGXPM2M6ueCsyC5uRtZpbnG5ZmZhU0JDebmJlVzmCnL6AAJ28zsxz3NjEzq6Aq9DYpMpPOsCQtl7QjLcubOO8wSQ9I+nGajf7zo70GM7N2iCaWThlVzVvSbOAGskmJA/i+pJ6I2FuwiP8eEY+kWSUelnReRDw4mmsxM2u1KjSbNJoGbaGkLZKmSZqeasrzgHOBDRHxbErYG4AluXNnSnpC0hvT+j2SPhgRL0bEIwAR8TLwA7I53czMDgpDTSydUjd5R0Qv2Vxrq4GbgLsiYisFprKPiOeBq4GvSFoKzIqI22qPkXQEcAHw8HDxJa2Q1Cep7+8f3NTUGzMzG61BFV86pUizySqyCTT3ASvTtrpT2f92Q8QGSe8FbgHeUrtP0iHAPcDNETHsiEgR0UU2ez0vPXhzFR56MrNxoAoP6RS5YTkbmEE2Pf20tK3RVPYASJpENgnxS6mcWl3Ajoj4myav2cysrSrfbJJ0AdcBa4Eb07b1wGJJsyTNAhanbXkfBX4EXAyskTQFQNJqYCbwF2O7fDOz1gsVXzqlbrOJpGXAQETcLWkysEnS2RHxbUmfIWtOAVgVEc/mzj0RuApYFBG/kvQocK2k24BPAj8GfqDsMdQvRcTtrX1rZmajU4Vmk7rJOyK6ge70ehA4tWbfGmBNnXOfJGsy2b9+Tc3uCnTEMbOJyo/Hm5lVUBX6eTt5m5nlVL7ZxMxsInLyNjOroCo8VOLkbWaWU4U271GPKmhmNl4NNrEUIWlJGutpp6SPD7P/Gknb01hSD0t6faMyK1PzPvyiGxsf1GIv7d5YekyAt59ceITdlnr8mWFHKWir3aefUHpMgDfNvaAjcf9tx30dibvngqtKj/mez20tPeZ+vbvPH9P5Qy1sOEnPyNwCvJvs6fTeNArr9prDfggsiIgXJX2IbCyp99Ur1zVvM7OcFj8evwjYGRG70kiqXwUuqj0gIh6JiBfT6mYKjLTq5G1mltPMZAy1o5+mZUWuuIajsOZcCTSc36AyzSZmZmVppqtg7einIyg0CiuApEvJJrk5o1FcJ28zs5wBtbSzYNFRWM8hG/fpjIj4TaNC3WxiZpbT4jkse4G5ko5PUz8uJZvk5rckvRX4MnBhROwpUqhr3mZmOa18wjIiBiRdTTZs9mRgTURsk7QK6IuIHuALZPMmfD2NtPqziLiwXrlO3mZmOa3sKggQEeuAdblt19e8PqfZMp28zcxyqvB4/KjbvCV9U9Jzku4fYf8nJL0s6bLc9kvSU0RbJG2S9Jbhzjcz65QqTIM2lpr3F4DDgD/P70jdXc4FTgLulfTziNiQdv+U7G7qXknnkXWxOTVfhplZpwxWoO7dsOYtaWGqJU+TNF3SNknzIuJh4FfDHH8OsBw4PyJ2ks1v+WlJ8wEiYlNE7E2HF3qSyMysTOOi5h0RvZJ6gNXAocBdETHioAUR8RDwUM36HuC0EQ6v+yRRelJpBYAmz2TSpOmNLtfMbMyiAjXvos0mq8j6Ku4DVrYisKSzyJL3O0Y6pvbJpUOmHnvwf5pmNi6Mp8kYZpP1QZwCTANeGEtQSacAtwPnRcQzYynLzKzVWt1VsB2K9jbpAq4D1gJjGptV0uuAbwCXpRnmzcwOKi1+wrItGta8JS0DBiLi7jQu7SZJZwOfBt4EzJDUD1wZEesLxLweOBK4NT1JNBARC0b9DszMWmygAjXvIjcsu4Hu9HqQV7v1fXs0ASPiKqD8keHNzAoaTzcszcwmjPF0w9LMbMJwzdvMrIJc8zYzq6DBcM3bzKxyqtDP28nbzCzHbd4t9NCskYZHaZ9f/ElnejRuvPOSjsTtu/Shxge12PS3DTc3a/vteM8fdiTungs68536vftuLz3mo49/q/SYreI2bzOzCnKziZlZBbnZxMysgtzbxMysgtxsYmZWQb5haWZWQW7zNjOrIDebmJlVUFTghmXRmXQOIOmbkp6TdP8ozl0r6QlJWyWtkTRltNdhZtZqg0ThpVNGnbyBLwCXjfLctWSz8JxMNiO9J2cws4PGEFF46ZS6yVvSQklbJE2TNF3SNknzACLiYeBXdc49RFKvpDPT+uckfTaduy4S4J+BOa16Q2ZmYxURhZciJC1JrQ07JX18mP3/QdI/pv3fk/SGRmXWbfOOiF5JPcBqshryXRGxtcjFRsSApMuBeyWtBJbw6hRq+y94Clnt/SNFyjQzK0Mra9Rp7t9bgHcD/UCvpJ6I2F5z2JXA3og4QdJSsone31ev3CLNJqtS0AXATc1cdERsA+4E7gM+EBEv5w65FXg0IjYOd76kFZL6JPXd99KuZkKbmY1aNPFfAYuAnRGxK+XArwIX5Y65CLgjvb4XeJfSDO0jKZK8ZwMzgMOBaUWuNOdk4Dng92s3SroBeC1wzUgnRkRXRCyIiAUXHPofRxHazKx5gxGFlwKOBZ6qWe9P24Y9JiIGgOeBI+sVWiR5dwHXkd1kvLHIle4n6U/TBbwTuFnSEWn7VcC5wMURUYWHmcxsAmnmhmVtC0FaVuSKG64Gnc/6RY75HXXbvCUtAwYi4u7UbrNJ0tkR8W1JG8l6jMyQ1A9cGRHra849Cvg88K6IeErSl4AvAsuBvwP+L/BY+svgGxGxqt61mJmVpZk274joIqvkjqQfOK5mfQ6we4Rj+iUdAswEnq0Xt9ENy26gO70epOaGY0Sc3uDcXwIn1qzfXPPaDweZ2UGrxQ/p9AJzJR0P/D9gKfD+3DE9ZBXbx4A/A74dDS7CSdTMLKeVvU1Sz7urgfXAZGBNRGyTtAroi4ge4O+BOyXtJKtxL21UrpO3mVlOqwemioh1wLrctutrXu8D3ttMmU7eZmY5gxXoR+HkbWaWU4WBqZy8zcxyPCSsmVkFeTKGFpo3f0/pMZds6Uy712PzF3ck7kl/dE/pMX/2temlxwQ4/t4rOhL3PZ8rNDRQyz36+LdKj3lIh77HrTDkZhMzs+pxzdvMrILc28TMrILcbGJmVkFuNjEzqyDXvM3MKsg1bzOzChqMwU5fQkNO3mZmOX483sysgqrweHyRadAOIGm+pMckbZO0RdIBsxxL+oSklyVdltt+STpni6RNkt4y2os3M2uHiCi8dMpoa94vAssiYoekY4DvS1ofEc8BSLqUbI7Kk4B7Jf08Ijakc38KnBEReyWdRzZ90KnDxDAz64gq9DZpWPOWtDDVkqdJmi5pGzA1InYARMRuYA/ZTPBIOodsOp/zI2InsBj4tKT56fhNEbE3Fb+ZbD43M7ODRjTxX6c0rHlHRK+kHmA1cChwV0T8dnQdSYuAqcBP0vEPAQ/VnL8HOG2E4q8EHhwpdpqFeQXAX580l2Vzjm50uWZmYzaeHo9fRTaJ5j5g5f6Nko4G7gSWRzT3biWdRZa83zHSMbWzMv/y3DMO/r9jzGxcGE+9TWYDM4ApwDTgBUmvAR4Aro2Izc0ElXQKcDtwXkQ808y5ZmbtNi7avJMu4DpgLXCjpKnAPwHdEfH1ZgJKeh3wDeCyiHiymXPNzMowLnqbSFoGDETE3ZImA5vIpqV/J3CkpMvToZdHxOMFYl4PHAncKolU9oLRXLyZWTtUoZ93kRuW3UB3ej3Iq936ukcTMCKuAq4azblmZmUYT23eZmYTxnjqbWJmNmFU4Yalk7eZWY6bTczMKsjjeZuZVZBr3mZmFVSFNm9V4RdmrCStSI/aj+uYjju+406k99rJuFUxqvG8K2jFBInpuOM77kR6r52MWwkTJXmbmY0rTt5mZhU0UZJ3J9rNOtVW57jjN+5Eeq+djFsJE+KGpZnZeDNRat5mZuOKk7eZWQVNiOQtabmkHWlZXmLcb0p6TtL9JcWbL+kxSdvSpNHva3H5dT9HSV+W9IKks3Pbr5G0PV3Tw5Je38q4Dc79rKSnJP26mfPGElfSYZIekPTj9G/x+SbOrfudkfQJSS9Luiy3/ZL0+W6RtEnSW4rGLBK3wblrJT0haaukNZKmFDyv4fe1Xe93XGhmxogqLmRTuO1K/5+VXs8qKfa7gAuA+0uKdyIwN70+BngaOKKMzxG4FvgaMA/4EXBKzb6zgMPS6w8B/1jWvx/wNuBo4NdlfW+Aw4Cz0uupwEayKf/G9J0BLgW+C5wAPA68u2bfafuvDzgP+F5Z31XgfEBpuQf4UCu+r+18v+NhGVc1b0kL0y/xNEnTJW0DPgxsiIhnI2IvsAFY0u64kuZFxMPAr1oZq15MYGpE7ACIiN3AHuC1LSp7xM8x1UrnAe+PiK3AhcBtko5L1/JIRLyYit8MzCkSU9I84NyR4tacOzPV/N6Y1u+R9MEUe3NEPN3Mex1r3Ih4MSIeSfFfBn6Qf8/NfmcknQMsB86PiJ3AYuDTkuanOJvSdY7mM6bRd1XSIZJ6JZ2Z1j8n6bPp3HWRAP9cNDZ1vq+teL/j3bga2yQieiX1AKuBQ4G7gFeAp2oO6weObXfclMTaplFMSYvIan0/aUXZ1PkcI+IO4I6a83fw6oxLeVcCDxZ9P5KWjBS35tznJV0NfEXSF8lqZLeN9r22Mq6kI8hqtF8sErfOdT4EPFSzvoes9jmcpj7jkWLmzh1QNuXhvZJWkv2Q/c6/cWouuQz4SLOx89/XVrzf8W5cJe9kFdAL7ANWAtcMc0w7+kfm45Zh2JiSjgbuBJZHjHpKkJZ/jpIuBRYAZxSMCdmf4g3jRsQGSe8FbgGabf9sS1xJh5A1I9wcEbsKxh0TSWeRJbN3jHDIqGNGxDZJdwL3AW9Pf1XUuhV4NCI2NhN7LN/XAu933BpXzSbJbGAGcDgwjazGdFzN/jnA7hLiluGAmJJeAzwAXBsRm1tY9pg+x/Rn8CeBCyPiNwVjUjSupEnAm4GXUjnNaFfcLmBHRPxNE3FHTdIpwO3ARRHxTJtingw8B/x+LvYNZE0ew/3Ijxh7LN/Xgu93/Op0o3urF6AHeD9ZovgS2Rfmp2Q3nWal17PbHbdm+5m06YblMO91KvAw8BcH0+cIvJXsz+G5zX6GReMCHyNLlqcDfcCU3P4Rb1i2Iy5Z88D/AiaV8Z0BXgfsBE4bzfe0SFzgT4Fvkd1ofJJ0c5FsQvFNwKFlfV+Lvt/xvHT8Alr6ZmAZ8I30ejLwPeBs4APpH3oncEWJcTcC/05WK+sHzm1zzGVkbdOP1yzzO/05krVd/qLmmnqKxkzrdeOmZPIj4PC0/j+AT6fXN6XPfij9/1PtjktWS4+0b/97vqqd3xmyGujemnh9TX7GdeMCR5El7OPS+krgjvR6gOzHeX/s69v9fS3yfsf74sfjzcwqaDy2eZuZjXtO3mZmFeTkbWZWQU7eZmYV5ORtZlZBTt5mZhXk5G1mVkH/H5Wy1rQ00mbOAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 2 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "%matplotlib inline\n",
    "import seaborn as sns\n",
    "sns.heatmap(pd.DataFrame(X_poly, columns=poly.get_feature_names()).corr())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(162500, 6)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>x0</th>\n",
       "      <th>x1</th>\n",
       "      <th>x2</th>\n",
       "      <th>x0 x1</th>\n",
       "      <th>x0 x2</th>\n",
       "      <th>x1 x2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1667.0</td>\n",
       "      <td>2072.0</td>\n",
       "      <td>2047.0</td>\n",
       "      <td>3454024.0</td>\n",
       "      <td>3412349.0</td>\n",
       "      <td>4241384.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1611.0</td>\n",
       "      <td>1957.0</td>\n",
       "      <td>1906.0</td>\n",
       "      <td>3152727.0</td>\n",
       "      <td>3070566.0</td>\n",
       "      <td>3730042.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1601.0</td>\n",
       "      <td>1939.0</td>\n",
       "      <td>1831.0</td>\n",
       "      <td>3104339.0</td>\n",
       "      <td>2931431.0</td>\n",
       "      <td>3550309.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1643.0</td>\n",
       "      <td>1965.0</td>\n",
       "      <td>1879.0</td>\n",
       "      <td>3228495.0</td>\n",
       "      <td>3087197.0</td>\n",
       "      <td>3692235.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1604.0</td>\n",
       "      <td>1959.0</td>\n",
       "      <td>1921.0</td>\n",
       "      <td>3142236.0</td>\n",
       "      <td>3081284.0</td>\n",
       "      <td>3763239.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       x0      x1      x2      x0 x1      x0 x2      x1 x2\n",
       "0  1667.0  2072.0  2047.0  3454024.0  3412349.0  4241384.0\n",
       "1  1611.0  1957.0  1906.0  3152727.0  3070566.0  3730042.0\n",
       "2  1601.0  1939.0  1831.0  3104339.0  2931431.0  3550309.0\n",
       "3  1643.0  1965.0  1879.0  3228495.0  3087197.0  3692235.0\n",
       "4  1604.0  1959.0  1921.0  3142236.0  3081284.0  3763239.0"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)\n",
    "\n",
    "X_poly = poly.fit_transform(X)\n",
    "print(X_poly.shape)\n",
    "\n",
    "pd.DataFrame(X_poly, columns=poly.get_feature_names()).head() "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x1a1f0d45c0>"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAV0AAAD8CAYAAADUv3dIAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAYqElEQVR4nO3dfbRddX3n8feHxJTwWJR2BhOotIYOmFIteagLlKCi0aVmLZcPhEUQjY2dilJpp9iRQQekVEedAqXWVCmQKgyi7USNE1xU0AoJ90JbarBM0sTKNVAYBbWLILn3fuePvS8cDufus8+5Zz/yebn28py9z9nnk9y7vvnxO78HRQRmZlaOA6oOYGb2bOKia2ZWIhddM7MSueiamZXIRdfMrEQuumZmJXLRNTObhaSrJT0k6TuzXJekKyTtknSPpN/od08XXTOz2V0DrM64/lpgSXpsAD7V74YuumZms4iIbwI/ynjJGuC6SGwDfl7SUVn3nD/KgL3s/3+7GzXlbeHzX1Z1hIE9cOoLq44wsINff3zVEQai406oOsLA7jjn76qOMLBX/NuNmus9Bqk5C37hV95N0kKdsTEiNg7wcYuA+zueT6TnHpjtDYUXXTOzukoL7CBFtluvfyQyi76Lrpm1y/RUmZ82ARzd8XwxsDfrDe7TNbN2mZrMf8zdZuDsdBTDbwI/johZuxbALV0za5mI6ZHdS9L1wCrgSEkTwIeA5ySfE38ObAFeB+wCHgPe0e+eLrpm1i7Toyu6EbG2z/UA3jPIPV10zaxdRtjSLYKLrpm1S7lfpA3MRdfM2sUtXTOz8sRoRiUUxkXXzNplhF+kFcFF18zaxd0LZmYl8hdpZmYlckvXzKxE/iLNzKxE/iLNzKw8EQ3u05V0OMlWFYtI1ojcC2yNiEdLyGZmNria9+nOurSjpLOBu0lW2DkIOBg4DbgrvTYrSRskjUsa/8x1148wrplZH9PT+Y8KZLV0Pwic1N2qlXQEsB24brY3dq7G3rTtesys4Wre0s0quqL3thPT9N6iwsyselP7q06QKavoXgrcLelmntp47RjgdOCSooOZmQ2l5qMXZu3TjYhrgeXAD4CfAU8AtwLLgO+VkM3MbHAxnf+oQObohYj4kaQzgE3Ax4CFwEdJCu9Li49nZjagprZ0O6wk2e3yduBOkmFjJxcZysxsaA0evTBjP7CPpJV7ILAnRrnzm5nZCEXNv0jL09IdIym6y4FTgLWSbio0lZnZsJrcp5taHxHj6eMHgTWS1hWYycxseDXv0+1bdDsKbue5TcXEMTObo5r3fnrBGzNrl6a3dM3MGsUtXTOzEk16EXMzs/K4pWtmVqKa9+nmGadrZtYcIxynK2m1pPsk7ZL0gR7Xj5H0DUl/L+keSa/rd08XXTNrlxFNA5Y0D7gKeC1wAsnEsBO6XnYhcGNEvAQ4A/izfvEK715Y+PyXFf0RI7Vv77eqjjCwFUubN1flZr5bdYSBnDj2zaojDOzOJUdVHaEao+vTXQHsiojdAJJuANYA93Z+GnBY+vhwkrVpMrlP18zaZYDRC5I2ABs6Tm1Md76BZG/I+zuuTZAsANbpw8DNkt5LsqXZq/p9pouumbVL5N8hrHNrsR567ZDTffO1wDUR8QlJLwU2SVqatSiYi66ZtcvoRi9MkCxrO2Mxz+w+WE+yYzoRcYekA4EjgYdmu6m/SDOzdhnderpjwBJJx0paQPJF2eau13wfeCWApONJlr99OOumbumaWbuM6Iu0iJiUdC6wFZgHXB0ROyRdDIxHxGbg94C/kPR+kq6HcyKy+zdcdM2sXaamRnariNgCbOk6d1HH43sZcCcdF10za5eaz0hz0TWzdnHRNTMrkRe8MTMrT0znH6dbBRddM2sXdy+YmZVohKMXiuCia2bt4paumVmJXHTNzEo0wII3VXDRNbN2cUvXzKxENR8yNtQqY5JOH3UQM7ORmJrKf1Rg2JbuZ4FjZrvYuRq75h3OAQccPOTHmJkNJpravSCpe93IJy8Bz8u6aedq7PMXLKp3W9/M2qXm3QtZLd2XAWcB/951XiQbtpmZ1U+D117YBjwWEbd1X5B0X3GRzMzmoKkt3Yh4LYCkE9KFejtd1OMtZmbVm6z3NOA8oxdulHSBEgslXQlcVnQwM7OhxHT+owJ5iu5Kkh0xbyfZqG0vA25PYWZWmunIf1Qgz5Cx/cA+YCHJTpd7svZ0NzOrUt2HjOVp6Y6RFN3lwCnAWkk3FZrKzGxYLWjpro+I8fTxg8AaSesKzGRmNrymjl6Y0VFwO89tKiaOmdkceRFzM7PyeI80M7MyueiamZWo5qMXXHTNrF3c0jUzK1HNi+5Qi5ibmdVVTE3nPvqRtFrSfZJ2SfrALK95q6R7Je2Q9Pl+9yy8pfvAqS8s+iNGasXS5g1BvvM7zRvBt3Ple6uOMJA9t11QdYSBxcPfrzpCNUbU0pU0D7gKOB2YAMYkbe5cAEzSEuAPgZMj4hFJv9jvvu5eMLNWGeGQsRXArojYDSDpBmAN0Lnq4m8BV0XEIwAR8VC/m7p7wczaZYBpwJI2SBrvODZ03GkRcH/H84n0XKfjgOMkfVvSNkmr+8VzS9fM2mWAEWOdW4v1oF5v6Xo+H1gCrAIWA9+StDQiHp3tM110zaxVYnJk43QnSJa1nbGYZGnb7tdsi4j9wJ50V50lJAuF9eTuBTNrl+kBjmxjwBJJx0paAJwBdG/Y+zfAaQCSjiTpbtiddVO3dM2sVUb1RVpETEo6F9gKzAOujogdki4GxiNic3rt1ZLuBaaA/xIRP8y6r4uumbXLCGcBR8QWYEvXuYs6Hgdwfnrk4qJrZq3iVcbMzMpU7/VuXHTNrF1isuoE2Vx0zaxV6r5trouumbWLi66ZWXnc0jUzK5GLrplZiWKq15IJ9eGia2at4paumVmJYrreLd3MBW8kHSbpV3qcP7G4SGZmw4vp/EcVZi26kt4K/DPwxXTvn+Udl6/JumnnwsDXTTwwmqRmZjlEKPdRhayW7n8FToqIFwPvADZJelN6LTNtRGyMiGURsezsxUeNKKqZWX91b+lm9enOi4gHACLiTkmnAV+RtJhnrp5uZlYL0zUfvZDV0v1pZ39uWoBXkWzM9qKCc5mZDSWmlfuoQlZL9z8DB0g6YWbL4Yj4abrx2hmlpDMzG1BjRy9ExD9GxE7gRkkXKLEQ+CTwO6UlNDMbQET+owp59khbSbI52+0kewbtBU4uMpSZ2bCa3L0wYz+wD1gIHAjsiaj7nA8ze7aqaihYXnlaumMkRXc5cAqwVtJNhaYyMxvS1JRyH1XI09JdHxHj6eMHgTWS1hWYycxsaHVv6fYtuh0Ft/PcpmLimJnNTd1HL3jBGzNrlapGJeTlomtmreKWrplZiaam84wPqI6Lrpm1irsXzMxKNN300QtmZk1S9yFj9e78MDMb0CjXXpC0WtJ9knZJ+kDG694sKSQt63fPwlu6B7/++KI/YqRu5rtVRxjYzpXvrTrCwJZsv7LqCAN5/CPnVR1hYFq4oOoIg3vVb8/5FqPqXpA0D7gKOB2YAMYkbZ5ZdbHjdYcC7wO257mvW7pm1ipT0wfkPvpYAeyKiN0R8QRwA8l64t0uAT4GPJ4nn4uumbVKDHB07ueYHhs6brUIuL/j+UR67kmSXgIcHRFfyZvPX6SZWasM0r0QERuBjbNc7nWjJ3uCJR0A/E/gnAHiueiaWbuMcPTCBMla4jMWk6wnPuNQYClwqySA/whslvTGXmvWzHDRNbNWGeFi32PAEknHAj8g2abszJmLEfFj4MiZ55JuBX4/q+CC+3TNrGUC5T4y7xMxCZwLbAW+C9wYETskXSzpjcPmc0vXzFplcoSTIyJiC7Cl69xFs7x2VZ57uuiaWav0a8FWzUXXzFql7hs4uuiaWau4pWtmViK3dM3MSjTllq6ZWXlqvluPi66Ztcu0W7pmZuWp+W49Lrpm1i51/yJtqGnAknrOyDAzq9q0lPuowrBrL7wr62LnGpVX335v1kvNzEZqaoCjCrN2L0j6yWyXgIVZN+1co/Kxy3+77l0sZtYiTR698CiwPCL+rfuCpPt7vN7MrHJ1H72Q1b1wHfBLs1z7fAFZzMzmbJDteqowa0s3Ii7MuHZBMXHMzOam7t0Lfb9Ik7S+6/k8SR8qLpKZ2fCmBziqkGf0wislbZF0lKSlwDaSvYHMzGpnSvmPKvSdHBERZ0p6G/BPwGPA2oj4duHJzMyG0PjJEZKWAOcBXwS+B6yTdFDBuczMhlL37oU804C/DLwnIm5Rss/w+SS7ZL6o0GRmZkMY4RZphchTdFdExE8AIiKAT0jaXGwsM7Ph1L17IU+f7jNmpkXEzmLimJnNTVXTe/PyKmNm1ip1H6fromtmrdLo7gVJhwOrgUUks+b2Alsj4tESspmZDazuRXfWIWOSzgbuBlYBBwEHA6cBd6XXzMxqp7FrLwAfBE7qbtVKOgLYTrIgjplZrTS5T1f0/sdgOr1mZlY7TR69cClwt6SbgZn1c48BTgcuyfsBOu6E4dNV4MSxb1YdYWB7bmveom+Pf+S8qiMM5MALL686wsD2X//xqiNUYnqEHQeSVgOXA/OAz0TEH3ddP59kJ51J4GHgnRHxr1n3nLVPNyKuBZYBtwE/A54AbgWWRcQ1Q/8pzMwKNKppwJLmAVcBrwVOANZK6m5F/j1JTTwRuAn4WL98maMXIuIRSfdExA1dYVZFxK39bm5mVrYRfkG2AtgVEbsBJN0ArAGe3PgxIr7R8fptwFn9bppnaccbJf2BEgslXQlcNlB0M7OSDNLS7dxENz02dNxqEU91rQJMpOdmsx74Wr98eSZHrAQ+CtxOso7u54CTc7zPzKx0k8rf1u3cRLeHXgMGet5c0lkk3bGn9vvMPEV3P7CPZAfgA4E9EVH38cdm9iw1wu6FCeDojueLSSaIPY2kV5EMsT01In7W76Z5uhfGSIrucuAUks7km/IkNjMr2wjX0x0Dlkg6VtIC4AzgaSssSnoJ8GngjRHxUJ58eVq66yNiPH38ILBG0ro8NzczK9uohoxFxKSkc4GtJEPGro6IHZIuBsYjYjPwP4BDgC8ky43z/Yh4Y9Z98yztON7j3KYh/gxmZoUb5fTeiNgCbOk6d1HH41cNek+vMmZmrVL3L5xcdM2sVaYqW8omHxddM2sVt3TNzEoUbumamZXHLV0zsxKNcpWxIrjomlmr1LvkuuiaWctM1rzsZk4DlvQaSeslvaDr/DuLDGVmNqwY4H9VyNqY8o9IFnH4NeAWSe/tuHxu1k07l0v77NduH01SM7McRrj2QiGyuhfeALwknX/8YeDzkn45It5Pnz3SOpdL2/e1K+rd1jezVqn7kLGs7oX5ETEJkO4I/AbgMElfABaUEc7MbFB1b+lmFd1/kfTkgrwRMRUR64H7gOMLT2ZmNoSpiNxHFbK6F97S62REXCjpUwXlMTObk7qP083aDXhfROyTtL7zfLpD5rsKT2ZmNoTGjl7o8EpJWyQdJWkpyY6Xhxacy8xsKHXv082ziPmZkt4G/BPwGLA2Ir5deDIzsyE0tnthhqQlwHnAF4HvAeskHVRwLjOzodS9eyHPNOAvA++JiFuUbAJ0PsmGbS8qNJmZ2RCqGpWQV56iuyIifgIQEQF8QtLmPu8xM6tE3bsX8vTp/qTHuZ3FxDEzmxuvp2tmVqK6TwN20TWzVml094Kkw4HVwCKStYH3AlvTtRjMzGonav5FWtbSjmcDdwOrgIOAg4HTgLvSa2ZmtTNF5D6qkNXS/SBwUnerVtIRwHbguiKDmZkNo8ndC6L3dkPT9FlP18ysKnXvXsgqupcCd0u6Gbg/PXcMcDpwSd4PuOOcvxs+XQXuXHJU1REGFg9/v+oIA9PCZi3JvP/6j1cdYWDPWfv7VUeoRN1bulmrjF0LLANuA34GPAHcCiyLiGvKCGdmNqhRTgOWtFrSfZJ2SfpAj+s/J+l/pde3d+8n2Uvm6IWIeETSPRFxQ9cHrYqIW/smNjMr2aimAafL2F5F8l/3E8CYpM0RcW/Hy9YDj0TECyWdAXwUeFvWffMs7XijpD9QYqGkK4HLhvtjmJkVa5rIffSxAtgVEbsj4gngBmBN12vWANemj28iWQo38zuvPEV3JUlf7u0kC93sBU7O8T4zs9INUnQ7dy5Pjw0dt1rEU99nQdLaXdT1cU++Jt1T8sfA87Ly5ZmRth/YBywEDgT2RETdpzeb2bPUIKMXOncu76FXi7X75nle8zR5WrpjJEV3OXAKsFbSTTneZ2ZWuhF2L0wAR3c8X0zyX/o9XyNpPnA48KOsm+Zp6a6PiPH08YPAGknrcrzPzKx0I1zwZgxYIulY4AfAGcCZXa/ZDLwduAN4M/C30aepnWdpx/Ee5zblDG1mVqqpEfV+RsSkpHOBrcA84OqI2CHpYmA8IjYDnwU2SdpF0sI9o999vcqYmbXKKGekRcQWYEvXuYs6Hj8OvGWQe7romlmr1H1GmouumbWKFzE3MyvRdIMXvDEzaxy3dM3MSjSq0QtFcdE1s1Zx94KZWYnq3r2QtUfaPEnvlnSJpJO7rl1YfDQzs8FNR+Q+qpC19sKngVOBHwJXSPpkx7U3FZrKzGxIo1zEvAhZRXdFRJwZEX9CsrzjIZK+JOnn6LNHWudyaV/Zt3uUec3MMk3FVO6jCllF98lNrCJiMiI2AP8A/C1wSNZNI2JjRCyLiGWvX/jLo0lqZpZDROQ+qpBVdMclre48EREXA38JvKDIUGZmwxrh0o6FmHX0QkScNcv5zwCfKSyRmdkc1H0L9r6LmKejF+Z3PD9M0l8WG8vMbDhNHr0wYz6wXdKJkl5NsrDvXcXGMjMbTt1HL+RZxPwPJd0CbAceAV4eEbsKT2ZmNoS6TwPO073wcuBy4GLgVuBPJT2/4FxmZkOp++iFPNOAPw68JSLuBZD0JpJhY/+pyGBmZsNow9oLL414ahRxRHxJ0m0FZjIzG1rdRy/k6dN9xrSNiPhhMXHMzObG2/WYmZWo8S1dM7MmafzohV4knT7qIGZmo9CGyRG9fHakKczMRqSxQ8YkbZ7tEvC8YuKYmc1N3XeOyOrTfRlwFvDvXecFrCgskZnZHDT5i7RtwGMR8YwxuZLuKy6Smdnw6j45QnX/VyGLpA0RsbHqHHk1LS80L3PT8oIzP9vkWXvhhB7nVhWSZnAbqg4woKblheZlblpecOZnlTyjF26UdIESCyVdCVxWdDAzszbKU3RXAkcDt5OspbsXODnzHWZm1lOeorsf2AcsBA4E9kTUZspH0/qUmpYXmpe5aXnBmZ9V+n6RJukfgf8NXEIyPvfTwP6IeHPx8czM2iVP0V0WEeNd59ZFxKZCk5mZtVCjh4yZmTXNsGsv1IKkt0vamR5vrzpPP5L+j6RHJX2l6ix5SHqxpDsk7ZB0j6S3VZ1pxlx+9pIulXS/pO7ZloUZNq+kgyR9VdI/pz+HPy4yZ9dnD/37Kulzku6T9B1JV0t6ThEZm6ixLV1JzwXGgWVAkOxQfFJEPFJpsAySXgkcBLw7Il5fdZ5+JB0HRETsTPfFuws4PiIerTjXnH72kn4T+FdgZ0QcUljQpz5v6LySDgJWRsQ3JC0AbgH+KCK+VmTm9LOH/n2V9DpgJuPngW9GxKdGHLGRGtHSlbQ8bWkdKOlgSTuA9wBfj4gfpb+8XwdWV5s00SuvpKURcQvw06rz9TLL3/GCiNgJEBF7gYeAX6gyk6SlwGvo87OXdHja0vrV9Pn1kn4r/bNsi4gHmpA3Ih6LiG+kuZ8A7gYWl5CZfr+vkuZLGpuZLCXpMkmXpu/dEingzlFmbrpGLGIeEWPpqmcfIRm69lckQ9nu73jZBLCognjP0CtvRHyn4liZ+mWWtAJYAPxL1ZkkrabPzz4ifizpXOAaSZcDR0TEXzQ5r6SfB95Asjt3oZlzvndS0jnATZLeR/IPycquzM8B1gHnjSpz0zWi6KYuJpmc8TjwPuD8Hq+pU19Jd94m6JlZ0lHAJuDtFYzR7pVJPV73jJ99RHxd0luAq4BfLyzh0xWSV9J84HrgiojYPdLEc/hdjYgdkjYBXybZxPaJrpf8GUnXwrdGkrQFGtG9kHoucAhwKMkkjQmSmXIzFpPMlquL7rxN8IzMkg4DvgpcGBHb6pCJnD97SQcAx5NM7nlusTGfVFTejSR90H8y6sDM/Xf114BHgf/QeVLSh0i6o3o1kJ61mlR0NwL/Dfgc8FFgK/BqSUdIOgJ4dXquLrrzNsHTMqdf3Pw1cF1EfKEOmdJzeX/27we+C6wFyvoGfeR5JX0EOBz43RIz5yLpTSSTpl4OXJF2gSDpXSR92WtrNIO1HgbZ2qKqAzgb+FL6eB6wHXgF8E5gV3q8o+qcOfJ+C3iYpCUzAbym6qx9Mp9N0nf+Dx3Hi6v+e0yfZ/7sgeNICtih6fNPAv89ffyx9O9/Ov3/D9c1L0mrONJrMz+Dd5X0d5z5+wocCfxf4Oj0+fuAa9PHkyT9/zOZL6r6d7wuR2OHjJmZNVGTuhfMzBrPRdfMrEQuumZmJXLRNTMrkYuumVmJXHTNzErkomtmVqL/DwnXj95diKwaAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 2 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "sns.heatmap(pd.DataFrame(X_poly, columns=poly.get_feature_names()).corr())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/super/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_split.py:1978: FutureWarning: The default value of cv will change from 3 to 5 in version 0.22. Specify it explicitly to silence this warning.\n",
      "  warnings.warn(CV_WARNING, FutureWarning)\n",
      "/Users/super/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_split.py:657: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of members in any class cannot be less than n_splits=3.\n",
      "  % (min_groups, self.n_splits)), Warning)\n"
     ]
    },
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-27-8af7d94f39fe>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0mgrid\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mGridSearchCV\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpipe\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpipe_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0mgrid\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      8\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgrid\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbest_score_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgrid\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbest_params_\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, groups, **fit_params)\u001b[0m\n\u001b[1;32m    685\u001b[0m                 \u001b[0;32mreturn\u001b[0m \u001b[0mresults\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    686\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 687\u001b[0;31m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_run_search\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mevaluate_candidates\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    688\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    689\u001b[0m         \u001b[0;31m# For multi-metric evaluation, store the best_index_, best_params_ and\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py\u001b[0m in \u001b[0;36m_run_search\u001b[0;34m(self, evaluate_candidates)\u001b[0m\n\u001b[1;32m   1146\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_run_search\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mevaluate_candidates\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1147\u001b[0m         \u001b[0;34m\"\"\"Search all candidates in param_grid\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1148\u001b[0;31m         \u001b[0mevaluate_candidates\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mParameterGrid\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparam_grid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1149\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1150\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py\u001b[0m in \u001b[0;36mevaluate_candidates\u001b[0;34m(candidate_params)\u001b[0m\n\u001b[1;32m    664\u001b[0m                                \u001b[0;32mfor\u001b[0m \u001b[0mparameters\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    665\u001b[0m                                in product(candidate_params,\n\u001b[0;32m--> 666\u001b[0;31m                                           cv.split(X, y, groups)))\n\u001b[0m\u001b[1;32m    667\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    668\u001b[0m                 \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m<\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/joblib/parallel.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, iterable)\u001b[0m\n\u001b[1;32m    922\u001b[0m                 \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_iterating\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_original_iterator\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    923\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 924\u001b[0;31m             \u001b[0;32mwhile\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdispatch_one_batch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0miterator\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    925\u001b[0m                 \u001b[0;32mpass\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    926\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/joblib/parallel.py\u001b[0m in \u001b[0;36mdispatch_one_batch\u001b[0;34m(self, iterator)\u001b[0m\n\u001b[1;32m    757\u001b[0m                 \u001b[0;32mreturn\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    758\u001b[0m             \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 759\u001b[0;31m                 \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_dispatch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtasks\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    760\u001b[0m                 \u001b[0;32mreturn\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    761\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/joblib/parallel.py\u001b[0m in \u001b[0;36m_dispatch\u001b[0;34m(self, batch)\u001b[0m\n\u001b[1;32m    714\u001b[0m         \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_lock\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    715\u001b[0m             \u001b[0mjob_idx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_jobs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 716\u001b[0;31m             \u001b[0mjob\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_async\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbatch\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcallback\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    717\u001b[0m             \u001b[0;31m# A job can complete so quickly than its callback is\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    718\u001b[0m             \u001b[0;31m# called before we get here, causing self._jobs to\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py\u001b[0m in \u001b[0;36mapply_async\u001b[0;34m(self, func, callback)\u001b[0m\n\u001b[1;32m    180\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mapply_async\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcallback\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    181\u001b[0m         \u001b[0;34m\"\"\"Schedule a func to be run\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 182\u001b[0;31m         \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mImmediateResult\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    183\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mcallback\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    184\u001b[0m             \u001b[0mcallback\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, batch)\u001b[0m\n\u001b[1;32m    547\u001b[0m         \u001b[0;31m# Don't delay the application, to avoid keeping the input\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    548\u001b[0m         \u001b[0;31m# arguments in memory\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 549\u001b[0;31m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresults\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbatch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    550\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    551\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/joblib/parallel.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    223\u001b[0m         \u001b[0;32mwith\u001b[0m \u001b[0mparallel_backend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_jobs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_n_jobs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    224\u001b[0m             return [func(*args, **kwargs)\n\u001b[0;32m--> 225\u001b[0;31m                     for func, args, kwargs in self.items]\n\u001b[0m\u001b[1;32m    226\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    227\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m__len__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/joblib/parallel.py\u001b[0m in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m    223\u001b[0m         \u001b[0;32mwith\u001b[0m \u001b[0mparallel_backend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_jobs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_n_jobs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    224\u001b[0m             return [func(*args, **kwargs)\n\u001b[0;32m--> 225\u001b[0;31m                     for func, args, kwargs in self.items]\n\u001b[0m\u001b[1;32m    226\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    227\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m__len__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py\u001b[0m in \u001b[0;36m_fit_and_score\u001b[0;34m(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)\u001b[0m\n\u001b[1;32m    552\u001b[0m         \u001b[0mfit_time\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mstart_time\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    553\u001b[0m         \u001b[0;31m# _score will return dict if is_multimetric is True\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 554\u001b[0;31m         \u001b[0mtest_scores\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_score\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mscorer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mis_multimetric\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    555\u001b[0m         \u001b[0mscore_time\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mstart_time\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mfit_time\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    556\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mreturn_train_score\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py\u001b[0m in \u001b[0;36m_score\u001b[0;34m(estimator, X_test, y_test, scorer, is_multimetric)\u001b[0m\n\u001b[1;32m    595\u001b[0m     \"\"\"\n\u001b[1;32m    596\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0mis_multimetric\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 597\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0m_multimetric_score\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mscorer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    598\u001b[0m     \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    599\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0my_test\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py\u001b[0m in \u001b[0;36m_multimetric_score\u001b[0;34m(estimator, X_test, y_test, scorers)\u001b[0m\n\u001b[1;32m    625\u001b[0m             \u001b[0mscore\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mscorer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    626\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 627\u001b[0;31m             \u001b[0mscore\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mscorer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    628\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    629\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mscore\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'item'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/sklearn/metrics/scorer.py\u001b[0m in \u001b[0;36m_passthrough_scorer\u001b[0;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[1;32m    238\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_passthrough_scorer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    239\u001b[0m     \u001b[0;34m\"\"\"Function that wraps estimator.score\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 240\u001b[0;31m     \u001b[0;32mreturn\u001b[0m \u001b[0mestimator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscore\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    241\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    242\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/sklearn/utils/metaestimators.py\u001b[0m in \u001b[0;36m<lambda>\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    114\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    115\u001b[0m         \u001b[0;31m# lambda, but not partial, allows help() to work with update_wrapper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 116\u001b[0;31m         \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mlambda\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    117\u001b[0m         \u001b[0;31m# update the docstring of the returned function\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    118\u001b[0m         \u001b[0mupdate_wrapper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py\u001b[0m in \u001b[0;36mscore\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m    600\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0msample_weight\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    601\u001b[0m             \u001b[0mscore_params\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'sample_weight'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msample_weight\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 602\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msteps\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscore\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mXt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mscore_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    603\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    604\u001b[0m     \u001b[0;34m@\u001b[0m\u001b[0mproperty\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/sklearn/base.py\u001b[0m in \u001b[0;36mscore\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m    355\u001b[0m         \"\"\"\n\u001b[1;32m    356\u001b[0m         \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mmetrics\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0maccuracy_score\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 357\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0maccuracy_score\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msample_weight\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msample_weight\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    358\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    359\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/sklearn/neighbors/classification.py\u001b[0m in \u001b[0;36mpredict\u001b[0;34m(self, X)\u001b[0m\n\u001b[1;32m    147\u001b[0m         \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'csr'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    148\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 149\u001b[0;31m         \u001b[0mneigh_dist\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mneigh_ind\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkneighbors\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    150\u001b[0m         \u001b[0mclasses_\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclasses_\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    151\u001b[0m         \u001b[0m_y\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_y\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/sklearn/neighbors/base.py\u001b[0m in \u001b[0;36mkneighbors\u001b[0;34m(self, X, n_neighbors, return_distance)\u001b[0m\n\u001b[1;32m    452\u001b[0m                 delayed_query(\n\u001b[1;32m    453\u001b[0m                     self._tree, X[s], n_neighbors, return_distance)\n\u001b[0;32m--> 454\u001b[0;31m                 \u001b[0;32mfor\u001b[0m \u001b[0ms\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mgen_even_slices\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_jobs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    455\u001b[0m             )\n\u001b[1;32m    456\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/joblib/parallel.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, iterable)\u001b[0m\n\u001b[1;32m    919\u001b[0m             \u001b[0;31m# remaining jobs.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    920\u001b[0m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_iterating\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 921\u001b[0;31m             \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdispatch_one_batch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0miterator\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    922\u001b[0m                 \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_iterating\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_original_iterator\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    923\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/joblib/parallel.py\u001b[0m in \u001b[0;36mdispatch_one_batch\u001b[0;34m(self, iterator)\u001b[0m\n\u001b[1;32m    757\u001b[0m                 \u001b[0;32mreturn\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    758\u001b[0m             \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 759\u001b[0;31m                 \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_dispatch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtasks\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    760\u001b[0m                 \u001b[0;32mreturn\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    761\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/joblib/parallel.py\u001b[0m in \u001b[0;36m_dispatch\u001b[0;34m(self, batch)\u001b[0m\n\u001b[1;32m    714\u001b[0m         \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_lock\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    715\u001b[0m             \u001b[0mjob_idx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_jobs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 716\u001b[0;31m             \u001b[0mjob\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_async\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbatch\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcallback\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    717\u001b[0m             \u001b[0;31m# A job can complete so quickly than its callback is\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    718\u001b[0m             \u001b[0;31m# called before we get here, causing self._jobs to\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py\u001b[0m in \u001b[0;36mapply_async\u001b[0;34m(self, func, callback)\u001b[0m\n\u001b[1;32m    180\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mapply_async\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcallback\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    181\u001b[0m         \u001b[0;34m\"\"\"Schedule a func to be run\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 182\u001b[0;31m         \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mImmediateResult\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    183\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mcallback\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    184\u001b[0m             \u001b[0mcallback\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, batch)\u001b[0m\n\u001b[1;32m    547\u001b[0m         \u001b[0;31m# Don't delay the application, to avoid keeping the input\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    548\u001b[0m         \u001b[0;31m# arguments in memory\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 549\u001b[0;31m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresults\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbatch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    550\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    551\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/joblib/parallel.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    223\u001b[0m         \u001b[0;32mwith\u001b[0m \u001b[0mparallel_backend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_jobs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_n_jobs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    224\u001b[0m             return [func(*args, **kwargs)\n\u001b[0;32m--> 225\u001b[0;31m                     for func, args, kwargs in self.items]\n\u001b[0m\u001b[1;32m    226\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    227\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m__len__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/joblib/parallel.py\u001b[0m in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m    223\u001b[0m         \u001b[0;32mwith\u001b[0m \u001b[0mparallel_backend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_jobs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_n_jobs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    224\u001b[0m             return [func(*args, **kwargs)\n\u001b[0;32m--> 225\u001b[0;31m                     for func, args, kwargs in self.items]\n\u001b[0m\u001b[1;32m    226\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    227\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m__len__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/sklearn/neighbors/base.py\u001b[0m in \u001b[0;36m_tree_query_parallel_helper\u001b[0;34m(tree, data, n_neighbors, return_distance)\u001b[0m\n\u001b[1;32m    289\u001b[0m     \u001b[0munder\u001b[0m \u001b[0mPyPy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    290\u001b[0m     \"\"\"\n\u001b[0;32m--> 291\u001b[0;31m     \u001b[0;32mreturn\u001b[0m \u001b[0mtree\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mquery\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_neighbors\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreturn_distance\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    292\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    293\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "pipe_params = {'poly_features__degree':[1, 2, 3], 'poly_features__interaction_only':[True, False], 'classify__n_neighbors':[3, 4, 5, 6]}\n",
    "\n",
    "pipe = Pipeline([('poly_features', poly), ('classify', knn)])\n",
    "\n",
    "grid = GridSearchCV(pipe, pipe_params)\n",
    "grid.fit(X, y)\n",
    "\n",
    "print(grid.best_score_, grid.best_params_)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 5. 针对文本的特征构建"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ItemID</th>\n",
       "      <th>Sentiment</th>\n",
       "      <th>SentimentText</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>is so sad for my APL frie...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>I missed the New Moon trail...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>omg its already 7:30 :O</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>.. Omgaga. Im sooo  im gunna CRy. I'...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>i think mi bf is cheating on me!!!   ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   ItemID  Sentiment                                      SentimentText\n",
       "0       1          0                       is so sad for my APL frie...\n",
       "1       2          0                     I missed the New Moon trail...\n",
       "2       3          1                            omg its already 7:30 :O\n",
       "3       4          0            .. Omgaga. Im sooo  im gunna CRy. I'...\n",
       "4       5          0           i think mi bf is cheating on me!!!   ..."
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tweets = pd.read_csv('~/开发/AI相关/特征工程/Feature-Engineering-Made-Easy-master/data/twitter_sentiment.csv', encoding='latin1')\n",
    "tweets.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Sentiment</th>\n",
       "      <th>SentimentText</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>is so sad for my APL frie...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>I missed the New Moon trail...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>omg its already 7:30 :O</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>.. Omgaga. Im sooo  im gunna CRy. I'...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>i think mi bf is cheating on me!!!   ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Sentiment                                      SentimentText\n",
       "0          0                       is so sad for my APL frie...\n",
       "1          0                     I missed the New Moon trail...\n",
       "2          1                            omg its already 7:30 :O\n",
       "3          0            .. Omgaga. Im sooo  im gunna CRy. I'...\n",
       "4          0           i think mi bf is cheating on me!!!   ..."
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "del tweets['ItemID']\n",
    "tweets.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(99989, 105849)"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X = tweets['SentimentText']\n",
    "y = tweets['Sentiment']\n",
    "\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "\n",
    "vect = CountVectorizer()\n",
    "_ = vect.fit_transform(X)\n",
    "_.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(99989, 105545)"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vect = CountVectorizer(stop_words='english')  # removes a set of english stop words (if, a, the, etc)\n",
    "_ = vect.fit_transform(X)\n",
    "_.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(99989, 31)"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vect = CountVectorizer(min_df=.05)  # only includes words that occur in at least 5% of the corpus documents\n",
    "# used to skim the number of features\n",
    "_ = vect.fit_transform(X)\n",
    "_.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(99989, 105849)"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vect = CountVectorizer(max_df=.8)  # only includes words that occur at most 80% of the documents\n",
    "# used to \"Deduce\" stop words\n",
    "_ = vect.fit_transform(X)\n",
    "_.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(99989, 3219557)"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vect = CountVectorizer(ngram_range=(1, 5))  # also includes phrases up to 5 words\n",
    "_ = vect.fit_transform(X)\n",
    "_.shape  # explodes the number of features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['00',\n",
       " '00 01',\n",
       " '00 01 minute',\n",
       " '00 01 minute after',\n",
       " '00 01 minute after applications',\n",
       " '00 10',\n",
       " '00 10 00',\n",
       " '00 56',\n",
       " '00 56 03',\n",
       " '00 56 03 beating',\n",
       " '00 56 03 beating my',\n",
       " '00 always',\n",
       " '00 always have',\n",
       " '00 always have such',\n",
       " '00 always have such warmth']"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vect.get_feature_names()[:15]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(99989, 105849)"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vect = CountVectorizer(lowercase=True)  # lower cases everything first\n",
    "_ = vect.fit_transform(X)\n",
    "_.shape  # features stays the same"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(99989, 1000)"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vect = CountVectorizer(max_features=1000)  # hard limits the features  based on max counts\n",
    "_ = vect.fit_transform(X)\n",
    "_.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(99989, 105849)"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vect = CountVectorizer(analyzer='word')  # default analyzer, decides to split into words\n",
    "_ = vect.fit_transform(X)\n",
    "_.shape  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(99989, 153)"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vect = CountVectorizer(analyzer='char')  # used characters\n",
    "_ = vect.fit_transform(X)\n",
    "_.shape  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['\\t', '\\n', ' ', '!', '\"', '#', '$', '%', '&', \"'\"]"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vect.get_feature_names()[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(99989, 149)"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vect = CountVectorizer(analyzer='char_wb')  # uses characters again but only those are aren't at the beginning or ends of words\n",
    "# wb stands for word boudnaries\n",
    "_ = vect.fit_transform(X)\n",
    "_.shape  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.stem.snowball import SnowballStemmer\n",
    "\n",
    "stemmer = SnowballStemmer('english')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'hello'"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stemmer.stem('hello')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'interest'"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stemmer.stem('interesting')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stemmer.stem('interesting') == stemmer.stem('interest')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "def word_tokenize(text, how='lemma'):\n",
    "    words = text.split(' ')  # tokenize into words\n",
    "    return [stemmer.stem(word) for word in words]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['hello', 'you', 'are', 'veri', 'interest']"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "word_tokenize(\"hello you are very interesting\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(99989, 154397)"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vect = CountVectorizer(analyzer=word_tokenize)\n",
    "_ = vect.fit_transform(X)\n",
    "_.shape  # fewer features as stemming makes words smaller"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((99989, 105849), 2.1863060975751192e-05)"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "vect = CountVectorizer()\n",
    "_ = vect.fit_transform(X)\n",
    "_.shape, _[0,:].mean()\n",
    "\n",
    "vect = TfidfVectorizer()\n",
    "_ = vect.fit_transform(X)\n",
    "_.shape, _[0,:].mean()  # same number of rows and columns, different cell values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
